In [1]:
import os

from IPython.display import IFrame, display, HTML

import pandas as pd
import numpy as np

from bokeh.embed import file_html
from bokeh.models import ColumnDataSource, Plot, Circle, Range1d, LinearAxis, TapTool, HoverTool, Text
from bokeh.models.actions import Callback
from bokeh.models.widgets import Slider
from bokeh.palettes import Spectral6
from bokeh.plotting import vplot
from bokeh.resources import INLINE

Get the Data


In [2]:
# Links via http://www.gapminder.org/data/
population_url = "http://spreadsheets.google.com/pub?key=phAwcNAVuyj0XOoBL_n5tAQ&output=xls"
fertility_url = "http://spreadsheets.google.com/pub?key=phAwcNAVuyj0TAlJeCEzcGQ&output=xls"
life_expectancy_url = "http://spreadsheets.google.com/pub?key=tiAiXcrneZrUnnJ9dBU-PAw&output=xls"


def get_data(url):
    """Download one spreadsheet and keep only the years 1964 - 2013.

    The sheet is read with its first column as the index. The frame is
    pivoted so that the year labels sit on the index, filtered to the
    inclusive range 1964..2013, and pivoted back to its original orientation.
    """
    df = pd.read_excel(url, index_col=0)
    df = df.unstack().unstack()
    df = df[(df.index >= 1964) & (df.index <= 2013)]
    df = df.unstack().unstack()
    return df


def load_cached(url, path):
    """Return the frame stored at `path`, downloading and caching it first if needed.

    When `path` does not exist the data is fetched with `get_data(url)` and
    written to `path` under the HDF key 'df'. The frame is always read back
    from the file, so a fresh download and a cached run return the same thing.
    """
    if not os.path.exists(path):
        get_data(url).to_hdf(path, 'df')
    return pd.read_hdf(path, 'df')


fertility_df = load_cached(fertility_url, 'fertility_df.hdf')
life_expectancy_df = load_cached(life_expectancy_url, 'life_expectancy_df.hdf')
population_df = load_cached(population_url, 'population_df.hdf')

In [3]:
# Restrict fertility and population to the countries that are present in the
# life expectancy data.
life_countries = life_expectancy_df.index
fertility_only = fertility_df.index.difference(life_countries)
fertility_df = fertility_df.drop(fertility_only)
population_only = population_df.index.difference(life_countries)
population_df = population_df.drop(population_only)

# Derive a marker size from population; anything below min_size (including
# missing values) is set to min_size so that no marker becomes too small.
min_size = 3
scaled_population = np.sqrt(population_df / np.pi) / 200
large_enough = scaled_population >= min_size
population_df_size = scaled_population.where(large_enough).fillna(min_size)

Get the regions and color them


In [6]:
# Download the country -> region table (this cell needs network access).
regions_url = "https://docs.google.com/spreadsheets/d/1OxmGUNWeADbPJkQxVPupSOK5MbAECdqThnvyPrwG5Os/pub?gid=1&output=xls"
regions_df = pd.read_excel(regions_url, index_col=0)
# Keep only the countries that also appear in the life expectancy data.
regions_df = regions_df.drop(regions_df.index.difference(life_expectancy_df.index))
regions_df['Group'] = regions_df['Group'].astype('category')
cats = list(regions_df['Group'].cat.categories)


def get_color(r):
    """Return the Spectral6 color assigned to the row's region group.

    `r` is one row of `regions_df`. The color is chosen by the position of
    `r.Group` in `cats`, so every group always maps to the same color.
    Raises ValueError if the group is not in `cats`, and IndexError if the
    group's position is beyond the end of the Spectral6 palette.
    """
    return Spectral6[cats.index(r.Group)]


regions_df['region_color'] = regions_df.apply(get_color, axis=1)


---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1181             try:
-> 1182                 h.request(req.get_method(), req.selector, req.data, headers)
   1183             except OSError as err: # timeout error

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in request(self, method, url, body, headers)
   1087         """Send a complete request to the server."""
-> 1088         self._send_request(method, url, body, headers)
   1089 

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in _send_request(self, method, url, body, headers)
   1125             body = body.encode('iso-8859-1')
-> 1126         self.endheaders(body)
   1127 

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in endheaders(self, message_body)
   1083             raise CannotSendHeader()
-> 1084         self._send_output(message_body)
   1085 

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in _send_output(self, message_body)
    921             message_body = None
--> 922         self.send(msg)
    923         if message_body is not None:

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in send(self, data)
    856             if self.auto_open:
--> 857                 self.connect()
    858             else:

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in connect(self)
   1222 
-> 1223             super().connect()
   1224 

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/http/client.py in connect(self)
    833         self.sock = self._create_connection((self.host,self.port),
--> 834                                             self.timeout, self.source_address)
    835 

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/socket.py in create_connection(address, timeout, source_address)
    493     err = None
--> 494     for res in getaddrinfo(host, port, 0, SOCK_STREAM):
    495         af, socktype, proto, canonname, sa = res

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/socket.py in getaddrinfo(host, port, family, type, proto, flags)
    532     addrlist = []
--> 533     for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
    534         af, socktype, proto, canonname, sa = res

gaierror: [Errno -2] Name or service not known

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-6-72947d6962b7> in <module>()
      1 regions_url = "https://docs.google.com/spreadsheets/d/1OxmGUNWeADbPJkQxVPupSOK5MbAECdqThnvyPrwG5Os/pub?gid=1&output=xls"
----> 2 regions_df = pd.read_excel(regions_url, index_col=0)
      3 regions_df = regions_df.drop(regions_df.index.difference(life_expectancy_df.index))
      4 regions_df.Group = regions_df.Group.astype('category')
      5 cats = list(regions_df.Group.cat.categories)

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/site-packages/pandas/io/excel.py in read_excel(io, sheetname, **kwds)
    149     engine = kwds.pop('engine', None)
    150 
--> 151     return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
    152 
    153 

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/site-packages/pandas/io/excel.py in __init__(self, io, **kwds)
    183         if isinstance(io, compat.string_types):
    184             if _is_url(io):
--> 185                 data = _urlopen(io).read()
    186                 self.book = xlrd.open_workbook(file_contents=data)
    187             else:

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    159     else:
    160         opener = _opener
--> 161     return opener.open(url, data, timeout)
    162 
    163 def install_opener(opener):

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in open(self, fullurl, data, timeout)
    461             req = meth(req)
    462 
--> 463         response = self._open(req, data)
    464 
    465         # post-process response

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in _open(self, req, data)
    479         protocol = req.type
    480         result = self._call_chain(self.handle_open, protocol, protocol +
--> 481                                   '_open', req)
    482         if result:
    483             return result

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    439         for handler in handlers:
    440             func = getattr(handler, meth_name)
--> 441             result = func(*args)
    442             if result is not None:
    443                 return result

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in https_open(self, req)
   1223         def https_open(self, req):
   1224             return self.do_open(http.client.HTTPSConnection, req,
-> 1225                 context=self._context, check_hostname=self._check_hostname)
   1226 
   1227         https_request = AbstractHTTPHandler.do_request_

/opt/miniconda/envs/ipython_bokeh_34/lib/python3.4/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1182                 h.request(req.get_method(), req.selector, req.data, headers)
   1183             except OSError as err: # timeout error
-> 1184                 raise URLError(err)
   1185             r = h.getresponse()
   1186         except:

URLError: <urlopen error [Errno -2] Name or service not known>

Build the plot

We build the plot as standalone HTML so that we can use the HTML slider widget instead of the IPython widget. This makes the plot more reusable and means the slider also works on NBViewer.


In [7]:
# Set up the data.
#
# Every year gets its own ColumnDataSource, stored in `sources` under the key
# '_<year>' (for example '_1964'). The sources are passed to the JS callback
# as named args, so inside the callback code the bare identifier _1964 refers
# to that year's source object.
#
# js_source_array is the text of a JS object literal that maps each year to
# the matching identifier, for example:
# '{1964: _1964, 1965: _1965, ....}'
# It is pasted into the callback code so that the slider value can be used to
# look up the source for the selected year at runtime.

years = list(fertility_df.columns)

region_color = regions_df['region_color']


def build_year_frame(year):
    """Return one frame holding fertility, life, population and region_color for `year`.

    The 'population' column is the marker size taken from `population_df_size`,
    not the raw population.
    """
    columns = [
        fertility_df[year],
        life_expectancy_df[year],
        population_df_size[year],
        region_color,
    ]
    # `keys` names the resulting columns, so the Series taken from the source
    # frames never have to be renamed in place.
    return pd.concat(columns, axis=1, keys=['fertility', 'life', 'population', 'region_color'])


sources = {}
for year in years:
    sources['_%s' % year] = ColumnDataSource(build_year_frame(year))

dictionary_of_sources = {year: '_%s' % year for year in years}
# Build the JS object literal directly rather than stripping the quotes from
# str(dict), which depends on how the year keys happen to be repr'd.
js_source_array = '{%s}' % ', '.join('%s: _%s' % (year, year) for year in years)


# Set up the plot
xdr = Range1d(1, 8)
ydr = Range1d(20, 85)
plot = Plot(
    x_range=xdr,
    y_range=ydr,
    title="",
    plot_width=800,
    plot_height=400,
    outline_line_color=None,
    toolbar_location=None,
)
# The x axis (fertility) goes below the plot, the y axis (life) to its left.
xaxis = LinearAxis()
yaxis = LinearAxis()
plot.add_layout(yaxis, 'left')
plot.add_layout(xaxis, 'below')
tooltips = "@index"
plot.add_tools(HoverTool(tooltips=tooltips))

# Add the circle
#
# The glyph draws from its own source, initialised with the first year's data.
# The callback overwrites this source's data on every slider move, so it must
# not be one of the per-year sources: otherwise the first year's data would be
# lost after the first move and could never be shown again.
renderer_source = ColumnDataSource(build_year_frame(years[0]))
highlighted = Circle(
    x='fertility', y='life', size='population',
    fill_color='region_color', fill_alpha=0.5,
    line_color='#7c7e71', line_width=0.5, line_alpha=0.5)
plot.add_glyph(renderer_source, highlighted)


# Add the slider
#
# The callback looks up the source for the selected year and copies its data
# into the source that the glyph is drawn from.
code = """
    var key = slider.get('value'),
        sources = %s,
        new_source_data = sources[key].get('data');
    renderer_source.set('data', new_source_data);
    renderer_source.trigger('change');
""" % js_source_array

callback = Callback(args=sources, code=code)
# Start the slider on the first year so that it matches the data shown initially.
slider = Slider(start=years[0], end=years[-1], value=years[0], step=1, title="Year", callback=callback)
callback.args["slider"] = slider
callback.args["renderer_source"] = renderer_source


layout = vplot(plot, slider)
html = file_html(layout, INLINE, "gapminder")

In [8]:
# Render the HTML document generated by file_html in the notebook output.
display(HTML(html))


gapminder

TO DO

  • Add a big gray year label in the background
  • Country legend
  • Clean up axes
  • Style slider
  • Style hover
  • Add play button (optional)

In [ ]: